Load Packages
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
Load Digits Data
#code for digits - ALL
rm(list=ls())
load("UnsupL_SISBID_2024.Rdata")
Create Subset of just 3’s and 8’s
dat38 = rbind(digits[which(rownames(digits)==3),],digits[which(rownames(digits)==8),])
With Centering & Scaling
pc = princomp(dat38) #default - centers and scales
biplot(pc,cex=.7)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
screeplot(pc)
PC Scatterplot
PC1 <- as.matrix(x=pc$scores[,1])
PC2 <- as.matrix(pc$scores[,2])
plot(PC1,PC2,type="n",xlab="PC1",ylab="PC2")
text(PC1,PC2,rownames(dat38),col=rownames(dat38))
Pairs Plot Using ggpairs
PC1 <- as.matrix(x=pc$scores[,1])
PC2 <- as.matrix(pc$scores[,2])
PC3 <- as.matrix(pc$scores[,3])
PC4 <- as.matrix(pc$scores[,4])
PC5<-as.matrix(pc$scores[,5])
pc.df.digits <- data.frame(digit_name = row.names(dat38), PC1, PC2,PC3, PC4, PC5)
ggpairs(pc.df.digits, mapping = aes(color = digit_name))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
PC Loadings
par(mfrow=c(3,5),mar=c(.1,.1,.1,.1))
for(i in 1:15){
imagedigit(pc$loadings[,i])
}
Variance explained
varex = 100*pc$sdev^2/sum(pc$sdev^2)
par(mfrow=c(2,1))
screeplot(pc)
plot(varex,type="l",ylab="% Variance Explained",xlab="Component")
Cumulative variance explained
#cumulative variance explained
cvarex = NULL
for(i in 1:ncol(cdat)){
cvarex[i] = sum(varex[1:i])
}
plot(cvarex,type="l",ylab="Cumulative Variance Explained",xlab="Component", main = "Principal Component vs. Variance Explained" )
Without Centering & Scaling
svdd = svd(dat38)
U = svdd$u
V = svdd$v #PC loadings
D = svdd$d
Z = dat38%*%V #PCs
PC Scatterplots
PC1 <- U[,1]
PC2 <- U[,2]
plot(PC1,PC2,type="n",xlab="PC1",ylab="PC2")
text(PC1,PC2,rownames(dat38),col=rownames(dat38))
Pairs Plot Using ggpairs
PC1 <- U[,1]
PC2 <- U[,2]
PC3 <- U[,3]
PC4 <- U[,4]
PC5 <- U[,5]
pc.df.digits <- data.frame(digit_name = row.names(dat38), PC1, PC2,PC3, PC4, PC5)
ggpairs(pc.df.digits, mapping = aes(color = digit_name))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
PC Loadings
par(mfrow=c(3,5),mar=c(.1,.1,.1,.1))
for(i in 1:15){
imagedigit(V[,i])
}
Variance Explained
#Variance Explained
varex = 0
cumvar = 0
denom = sum(D^2)
for(i in 1:256){
varex[i] = D[i]^2/denom
cumvar[i] = sum(D[1:i]^2)/denom
}
Screeplot
par(mfrow=c(1,2))
plot(1:256,varex,type="l",lwd=2,xlab="PC",ylab="% Variance Explained")
plot(1:256,cumvar,type="l",lwd=2,xlab="PC",ylab="Cummulative Variance Explained")
Load in Packages
library(ISLR)
data(College)
cdat = College[,2:18]
dim(cdat)
## [1] 777 17
names(cdat)
## [1] "Apps" "Accept" "Enroll" "Top10perc" "Top25perc"
## [6] "F.Undergrad" "P.Undergrad" "Outstate" "Room.Board" "Books"
## [11] "Personal" "PhD" "Terminal" "S.F.Ratio" "perc.alumni"
## [16] "Expend" "Grad.Rate"
pc = princomp(cdat) #default - centers and scales
#Go back and display these plots side by side
biplot(pc,cex=.7)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
screeplot(pc)
scatter plots - patterns among observations
PC1 <- as.matrix(x=pc$scores[,1])
PC2 <- as.matrix(pc$scores[,2])
PC <- data.frame(State = row.names(cdat), PC1, PC2)
ggplot(PC, aes(PC1, PC2)) +
geom_text(aes(label = State), size = 3) +
xlab("PC1") +
ylab("PC2") +
ggtitle("First Two Principal Components of College Data")
Pairs Plot
comp_labels<-c("PC1","PC2","PC3","PC4", "PC5")
pairs(pc$scores[,1:5], labels = comp_labels, main = "Pairs of PC's for College Data")
Loadings - variables that contribute to these patterns
par(mfrow=c(2,1))
barplot(pc$loadings[,1],cex.names=.6,main="PC 1 Loadings")
barplot(pc$loadings[,2],cex.names=.6,main="PC 2 Loadings")
Variance explained
varex = 100*pc$sdev^2/sum(pc$sdev^2)
par(mfrow=c(2,1))
screeplot(pc)
plot(varex,type="l",ylab="% Variance Explained",xlab="Component")
Cumulative variance explained
#cumulative variance explained
cvarex = NULL
for(i in 1:ncol(cdat)){
cvarex[i] = sum(varex[1:i])
}
plot(cvarex,type="l",ylab="Cumulative Variance Explained",xlab="Component", main = "Principal Component V. Variance Explained" )